First we load the required dataset.

library(readr)
df <- read_csv('NBA_PLAYERS.csv')
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   TEAM = col_character(),
##   NAME = col_character(),
##   URL = col_character(),
##   POSITION = col_character(),
##   AGE = col_character(),
##   COLLEGE = col_character(),
##   SALARY = col_character(),
##   FGM_FGA = col_character(),
##   THM_THA = col_character(),
##   FTM_FTA = col_character()
## )
## See spec(...) for full column specifications.

We can now view the dataset.

View(df)

To check the number of columns and type of data in each column we can use structure function.

str(df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 550 obs. of  30 variables:
##  $ TEAM           : chr  "Boston Celtics" "Boston Celtics" "Boston Celtics" "Boston Celtics" ...
##  $ NAME           : chr  "Aron Baynes" "Justin Bibbs" "Jabari Bird" "Jaylen Brown" ...
##  $ EXPERIENCE     : num  6 0 1 2 1 2 8 11 7 0 ...
##  $ URL            : chr  "http://www.espn.com/nba/player/_/id/2968439" "http://www.espn.com/nba/player/_/id/3147500" "http://www.espn.com/nba/player/_/id/3064308" "http://www.espn.com/nba/player/_/id/3917376" ...
##  $ POSITION       : chr  "SF" "G" "SG" "F" ...
##  $ AGE            : chr  "31" "22" "24" "21" ...
##  $ HT             : num  208 196 198 201 198 ...
##  $ WT             : num  117.7 99.5 89.6 99.5 92.8 ...
##  $ COLLEGE        : chr  "Washington State" "Virginia Tech" "California" "California" ...
##  $ SALARY         : chr  "5,193,600" "Not signed" "1,349,464" "5,169,960" ...
##  $ PPG_LAST_SEASON: num  6 0 3 14.5 1 1.4 2 12.9 24.4 0 ...
##  $ APG_LAST_SEASON: num  1.1 0 0.6 1.6 0 0.2 0 7.4 5.1 0 ...
##  $ RPG_LAST_SEASON: num  5.4 0 1.5 4.9 0.5 0.4 1 1.1 3.8 0 ...
##  $ PER_LAST_SEASON: num  12.09 0 12.18 13.69 -4.82 ...
##  $ PPG_CAREER     : num  5.4 0 3 10.4 1 1.6 15.6 14.2 22 0 ...
##  $ APG_CAREER     : num  0.7 0 0.6 1.2 0 0.2 3.4 8.6 5.5 0 ...
##  $ RGP_CAREER     : num  4.4 0 1.5 3.8 0.5 0.5 4.2 1.2 3.4 0 ...
##  $ GP             : num  376 0 13 148 2 47 517 718 441 0 ...
##  $ MPG            : num  15 0 8.8 23.6 1.5 5.8 31.3 33.2 33.9 0 ...
##  $ FGM_FGA        : chr  "2.2-4.3" "0" "1.2-2.0" "3.8-8.3" ...
##  $ FGP            : num  0.502 0 0.577 0.461 0.5 0.418 0.444 0.525 0.462 0 ...
##  $ THM_THA        : chr  "0.0-0.1" "0" "0.2-0.5" "1.1-3.0" ...
##  $ THP            : num  0.143 0 0.429 0.379 0 0.294 0.368 0.37 0.388 0 ...
##  $ FTM_FTA        : chr  "1.0-1.3" "0" "0.5-1.0" "1.6-2.4" ...
##  $ FTP            : num  0.802 0 0.462 0.658 0 0.71 0.82 0.75 0.875 0 ...
##  $ APG            : num  0.7 0 0.6 1.2 0 0.2 3.4 3.2 5.5 0 ...
##  $ BLKPG          : num  0.5 0 0.1 0.3 0 0 0.4 1.2 0.3 0 ...
##  $ STLPG          : num  0.2 0 0.2 0.7 0 0.1 1 0.8 1.3 0 ...
##  $ TOPG           : num  0.8 0 0.6 1.3 0.5 0.1 2 1.6 2.7 0 ...
##  $ PPG            : num  5.4 0 3 10.4 1 1.6 15.6 14.2 22 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   TEAM = col_character(),
##   ..   NAME = col_character(),
##   ..   EXPERIENCE = col_double(),
##   ..   URL = col_character(),
##   ..   POSITION = col_character(),
##   ..   AGE = col_character(),
##   ..   HT = col_double(),
##   ..   WT = col_double(),
##   ..   COLLEGE = col_character(),
##   ..   SALARY = col_character(),
##   ..   PPG_LAST_SEASON = col_double(),
##   ..   APG_LAST_SEASON = col_double(),
##   ..   RPG_LAST_SEASON = col_double(),
##   ..   PER_LAST_SEASON = col_double(),
##   ..   PPG_CAREER = col_double(),
##   ..   APG_CAREER = col_double(),
##   ..   RGP_CAREER = col_double(),
##   ..   GP = col_double(),
##   ..   MPG = col_double(),
##   ..   FGM_FGA = col_character(),
##   ..   FGP = col_double(),
##   ..   THM_THA = col_character(),
##   ..   THP = col_double(),
##   ..   FTM_FTA = col_character(),
##   ..   FTP = col_double(),
##   ..   APG = col_double(),
##   ..   BLKPG = col_double(),
##   ..   STLPG = col_double(),
##   ..   TOPG = col_double(),
##   ..   PPG = col_double()
##   .. )

We can check if any of the columns has any missing data or NA values with the following code.

unique_elements = lapply(df,unique)
lapply(lapply(unique_elements,is.na),sum)
## $TEAM
## [1] 0
## 
## $NAME
## [1] 0
## 
## $EXPERIENCE
## [1] 0
## 
## $URL
## [1] 0
## 
## $POSITION
## [1] 0
## 
## $AGE
## [1] 0
## 
## $HT
## [1] 0
## 
## $WT
## [1] 0
## 
## $COLLEGE
## [1] 0
## 
## $SALARY
## [1] 0
## 
## $PPG_LAST_SEASON
## [1] 1
## 
## $APG_LAST_SEASON
## [1] 1
## 
## $RPG_LAST_SEASON
## [1] 1
## 
## $PER_LAST_SEASON
## [1] 1
## 
## $PPG_CAREER
## [1] 0
## 
## $APG_CAREER
## [1] 0
## 
## $RGP_CAREER
## [1] 0
## 
## $GP
## [1] 0
## 
## $MPG
## [1] 0
## 
## $FGM_FGA
## [1] 0
## 
## $FGP
## [1] 0
## 
## $THM_THA
## [1] 0
## 
## $THP
## [1] 0
## 
## $FTM_FTA
## [1] 0
## 
## $FTP
## [1] 0
## 
## $APG
## [1] 0
## 
## $BLKPG
## [1] 0
## 
## $STLPG
## [1] 0
## 
## $TOPG
## [1] 0
## 
## $PPG
## [1] 0

Now we see that 4 columns have missing data.These 4 columns are actually those containing data about last years statistics .Since in our analysis we are not making any comparisons based on time series we can drop these columns.

df$PPG_LAST_SEASON = NULL
df$APG_LAST_SEASON = NULL
df$RPG_LAST_SEASON = NULL
df$PER_LAST_SEASON = NULL

Now we check if the age column has any non numeric data and replace it with the mean player age

unique(df$AGE)
##  [1] "31" "22" "24" "21" "28" "32" "26" "23" "29" "20" "25" "33" "27" "19"
## [15] "-"  "30" "34" "18" "36" "37" "35" "40" "38" "41"
df$AGE[df$AGE == '-'] = 0
df$AGE = sapply(df$AGE,as.numeric)
mean_age = mean(df$AGE)
df$AGE[df$AGE == 0] = round(mean_age)

Now we also see that the Salary column has a value which says “not signed”.This means that the particular player does not have a contract yet ,hence we replace his salary with 0.

unique(df$SALARY)
##   [1] "5,193,600"  "Not signed" "1,349,464"  "5,169,960"  "31,214,295"
##   [6] "28,928,709" "20,099,189" "5,375,000"  "1,378,242"  "3,050,390" 
##  [11] "11,660,716" "6,700,800"  "838,464"    "2,667,600"  "2,034,120" 
##  [16] "15,400,000" "18,500,000" "4,449,000"  "1,656,092"  "9,530,000" 
##  [21] "13,764,045" "1,512,601"  "8,000,000"  "2,470,357"  "1,618,320" 
##  [26] "1,702,800"  "1,632,240"  "1,942,422"  "7,019,698"  "5,000,000" 
##  [31] "4,544,000"  "1,795,015"  "17,325,000" "6,500,000"  "18,622,514"
##  [36] "3,739,920"  "1,619,260"  "12,253,780" "4,294,480"  "4,155,720" 
##  [41] "5,697,054"  "1,485,440"  "7,119,650"  "8,575,916"  "12,800,562"
##  [46] "10,464,092" "25,467,250" "8,339,880"  "1,740,000"  "1,600,520" 
##  [51] "12,250,000" "2,526,840"  "1,703,649"  "6,434,520"  "10,000,000"
##  [56] "21,666,667" "23,114,067" "31,200,000" "8,333,333"  "1,826,300" 
##  [61] "9,367,200"  "1,569,360"  "1,544,951"  "16,539,326" "8,653,847" 
##  [66] "2,536,898"  "5,337,000"  "37,457,154" "30,000,000" "1,644,240" 
##  [71] "17,469,565" "16,000,000" "8,307,692"  "18,988,725" "5,027,028" 
##  [76] "12,000,000" "21,587,579" "3,375,360"  "13,565,218" "6,000,000" 
##  [81] "14,800,000" "6,134,520"  "7,000,000"  "4,320,500"  "3,046,200" 
##  [86] "6,300,000"  "1,349,383"  "7,461,960"  "3,500,000"  "1,000,000" 
##  [91] "1,655,160"  "5,757,120"  "35,654,150" "1,689,840"  "1,487,694" 
##  [96] "9,000,000"  "1,762,080"  "20,421,546" "15,000,000" "7,464,912" 
## [101] "8,165,160"  "4,661,280"  "3,314,365"  "3,552,960"  "13,585,000"
## [106] "3,258,539"  "6,041,520"  "949,000"    "1,238,464"  "11,750,000"
## [111] "7,305,600"  "4,696,875"  "3,000,000"  "5,470,920"  "2,207,040" 
## [116] "3,844,760"  "2,807,880"  "8,739,500"  "5,460,000"  "11,692,308"
## [121] "11,011,234" "11,286,516" "4,441,200"  "4,221,000"  "8,740,980" 
## [126] "4,384,616"  "1,990,520"  "19,500,000" "14,357,750" "4,536,120" 
## [131] "20,000,000" "3,263,294"  "2,494,346"  "2,280,600"  "12,500,000"
## [136] "2,760,095"  "19,000,000" "3,472,887"  "7,560,000"  "24,119,025"
## [141] "2,272,391"  "2,775,000"  "4,068,600"  "14,720,000" "1,952,760" 
## [146] "2,500,000"  "25,434,263" "1,857,480"  "32,088,932" "17,043,478"
## [151] "3,940,402"  "3,275,280"  "10,002,681" "4,075,000"  "10,500,000"
## [156] "12,400,000" "1,911,960"  "7,945,000"  "2,407,560"  "7,333,333" 
## [161] "21,000,000" "2,659,800"  "3,410,284"  "13,964,045" "24,157,303"
## [166] "1,641,000"  "9,607,500"  "2,481,000"  "11,327,466" "3,382,000" 
## [171] "2,799,720"  "13,000,000" "10,607,143" "2,534,280"  "3,710,850" 
## [176] "24,107,258" "1,230,000"  "6,560,640"  "22,897,200" "9,631,250" 
## [181] "3,819,960"  "15,293,104" "3,206,160"  "1,621,415"  "13,500,375"
## [186] "35,650,150" "3,651,480"  "14,631,250" "7,969,537"  "8,641,000" 
## [191] "30,521,115" "7,666,667"  "5,915,040"  "5,285,394"  "12,252,928"
## [196] "25,976,111" "2,205,000"  "8,808,685"  "1,567,707"  "22,347,015"
## [201] "6,153,846"  "2,487,000"  "27,739,975" "3,125,000"  "16,800,000"
## [206] "10,087,200" "11,571,429" "2,947,320"  "2,357,160"  "1,667,160" 
## [211] "2,516,048"  "18,089,887" "1,634,640"  "2,299,080"  "7,200,000" 
## [216] "2,250,960"  "4,350,000"  "13,766,421" "1,620,480"  "5,356,440" 
## [221] "24,000,000" "17,000,000" "3,206,640"  "988,464"    "3,627,842" 
## [226] "7,488,372"  "3,447,480"  "14,087,500" "13,528,090" "2,955,840" 
## [231] "18,109,175" "6,270,000"  "14,651,700" "19,245,370" "12,537,527"
## [236] "11,550,000" "25,434,262" "3,448,926"  "7,250,000"  "4,865,040" 
## [241] "1,050,000"  "21,590,909" "2,639,314"  "4,969,080"  "2,416,222" 
## [246] "12,750,000" "2,749,080"  "15,944,154" "3,454,500"  "8,600,000" 
## [251] "3,208,630"  "26,011,913" "12,650,000" "3,129,187"  "5,450,000" 
## [256] "19,169,800" "11,830,358" "1,773,840"  "2,000,000"  "16,517,857"
## [261] "2,166,360"  "24,605,181" "1,874,640"  "3,364,249"  "29,230,769"
## [266] "3,499,800"  "12,917,808" "2,894,160"  "20,445,779" "15,170,787"
## [271] "14,000,000" "2,444,053"  "2,160,746"  "4,750,000"  "7,839,435" 
## [276] "5,455,236"  "2,118,840"  "30,560,700" "1,757,429"  "5,451,600" 
## [281] "15,500,000" "6,957,105"  "3,628,920"  "2,795,000"  "10,837,079"
## [286] "10,595,506" "27,977,689" "25,759,766" "11,111,111" "1,760,520" 
## [291] "17,868,853" "2,074,320"  "1,679,520"  "11,536,515" "7,305,825" 
## [296] "9,600,000"  "16,900,000" "23,241,573" "13,045,455" "3,111,480" 
## [301] "2,150,000"  "14,975,000" "5,250,000"  "3,360,000"
df$SALARY[df$SALARY == "Not signed"] = "0"
df$SALARY = as.numeric(gsub(",", "", df$SALARY))
df$COLLEGE[grep("-",df$COLLEGE)] = "Others"

We will not be using the URL column as it has some external links.We can drop it.

df$URL = NULL

We see that the columns FGM_FGA(Field goals made vs Field goals attempted) has data as a string with yphens.We are interested to know the ration of these numbers in the column.This ratio is directly indicated in the FGP(Field goal percentage).Similarly THM_THA and FTM_FTA can be represented by THP and FTP.Now since we have columns with required ratios we can drop redundant columns.

df$FTM_FTA = NULL
df$FGM_FGA = NULL
df$THM_THA = NULL

Since the ppg ,apg are redundant with columns representing same statistics exist for career.

df$PPG = NULL
df$APG = NULL

Player with the maximum Salary(considering only the players who have revealed their salary to ESPN).

df$NAME[df$SALARY == max(df$SALARY)]
## [1] "Stephen Curry"

Calculating count of players based on the given grouping

Plot of distribution of experience in the league

library("plotly")
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(df[,3], labels = labels, values = ex, type = 'pie') %>%
        layout(title = 'Experience of players in the NBA as of 2018-2019',
        xaxis = list(showgrid = FALSE, zeroline = FALSE),
        yaxis = list(showgrid = FALSE, zeroline = FALSE))

From the above pie chart we can see that majority of the players are fairly young having very little experience playing in the league.We can also see that there are very few players who have been in the league for more than 15 years.

Calculating the number of players in the league based on college they attended.

c = sort(df$COLLEGE)
c = as.data.frame(table(c))
colnames(c) = c("College","Frequency")
#c = c[ -c(1),]
is_applicable = vector()
for (i in 1:length(c$Frequency)) {
  if ( c$Frequency[i] > 8)
    is_applicable[i] = TRUE
  else
    is_applicable[i] = FALSE
}
colors = vector(mode = "character",length = 30)
for (i in 1:length(c$Frequency)) {
  if ( is_applicable[i])
    colors[i] = "rgba(0,255,0,0.7)"
  else
    colors[i] = "rgba(255,0,0,0.7)"
}

Plot of number of players vs university attended

p = plot_ly(x = ~c$College,y = c$Frequency,marker = list(color = colors))
p = layout(p,title = "Number Of Players vs University Attended",xaxis = list(title = "University",type = "category"),yaxis = list(title = "Frequency"))
p
## No trace type specified:
##   Based on info supplied, a 'bar' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#bar

From the above plot,we can see that most of the players in the may not have attended college in the USA.The University of Kentucky has the most NBA players among the universities situated in the USA.

Calculating and plotting gross salary per team in the NBA for the 2018-19 season

team_gross_salary = tapply(df$SALARY, df$TEAM, sum)
teams = unique(df$TEAM)
sal_cap = 101900000
sals = data.frame(teams,team_gross_salary)
colnames(sals) = c("Team","Gross Salary")
is_applicable = vector()
for (i in 1:length(team_gross_salary)) {
  if ( team_gross_salary[i] > sal_cap)
    is_applicable[i] = TRUE
  else
    is_applicable[i] = FALSE
}
colors = vector(mode = "character",length = 30)
for (i in 1:length(team_gross_salary)) {
  if ( is_applicable[i])
    colors[i] = "rgba(255,0,0,0.7)"
  else
    colors[i] = "rgba(0,255,0,0.7)"
}
hline <- function(y,color = "black") {
  list(
    type = "line", 
    name = "NBA 2018-19 Salary Cap",
    x0 = 0, 
    x1 = 1, 
    xref = "paper",
    y0 = y, 
    y1 = y, 
    line = list(color = color)
  )
}
p <- plot_ly(sals,x = ~teams,y = ~team_gross_salary,type = "bar" ,marker = list(color = colors))
p = layout(p,title = "Gross Salary per team",xaxis = list(title = "Teams"),yaxis = list(title = "Gross Salary"))
p <- layout(p,shapes = list(hline(sal_cap)))
p

From the above plot,we can see that majority of the teams have exceeded the NBA salary cap and do not have much cap space to sign new players.

nba_mean_age = mean(df$AGE)
team_mean_age = tapply(df$AGE, df$TEAM, mean)
ages = data.frame(teams,team_mean_age)
colnames(sals) = c("Team","Team Mean Age")
is_applicable = vector()
for (i in 1:length(team_mean_age)) {
  if ( team_mean_age[i] > nba_mean_age)
    is_applicable[i] = TRUE
  else
    is_applicable[i] = FALSE
}
colors = vector(mode = "character",length = 30)
for (i in 1:length(team_mean_age)) {
  if ( is_applicable[i])
    colors[i] = "rgba(255,0,0,0.7)"
  else
    colors[i] = "rgba(0,255,0,0.7)"
}
hline <- function(y,color = "blue") {
  list(
    type = "line", 
    title = "NBA 2018-19 Mean Age",
    name = "NBA 2018-19 Mean Age",
    x0 = 0, 
    x1 = 1, 
    xref = "paper",
    y0 = y, 
    y1 = y, 
    line = list(color = color)
  )
}
p <- plot_ly(ages,x = ~teams,y = ~team_mean_age,type = "bar" ,marker = list(color = colors))
p = layout(p,title = "Mean Age of players per team",xaxis = list(title = "Teams"),yaxis = list(title = "Mean Age"))
p <- layout(p,shapes = list(hline(nba_mean_age)))
p

From the above plot,we can see that the mean player age of majority of the teams is less than the league’s mean player age.

Calculating top earner from each team

#install.packages("dplyr")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
max_sal = function(group){
  group[which.max(group)]
}
top_earners = group_by(df)
top_earners = top_earners[order(top_earners$TEAM),]
sals = as.data.frame(tapply(top_earners$SALARY,top_earners$TEAM,max_sal))
colnames(sals) = c("MaxSalary")
top_earners = top_earners %>% filter(SALARY %in% sals$MaxSalary)
top_earners = top_earners[-c(8,17,18,24),]
pos = unique(top_earners$POSITION)
counts = count(top_earners,top_earners$POSITION)
colnames(counts) = c("Position","No of Players")
colors = c("rgb(150, 50, 180)","rgb(255, 127, 14)","rgb(44, 160, 44)","rgb(214, 39, 40)","rgb(140, 86, 75)")

Plotting top earner vs position played

p = plot_ly(counts,x = ~pos,y = ~counts$`No of Players`,type = "bar",marker = list(color = colors ))
p = layout(p,title = "Distribution of positions of top earners in the NBA",xaxis = list(title = "Position"),yaxis = list(title = "Number of Players"))
p

We can see that the league has quite a few high earning players who are Power Forwards and Small Forwards.

To check the correlation between the columns we have to drop the non numeric columns.

a = df
a$TEAM = NULL
a$NAME = NULL
a$COLLEGE = NULL
a$POSITION = NULL

Constructing a correlation plot and a correlation matrix to check the and visualize correlation.

#install.packages("corrplot")
library("corrplot")
## corrplot 0.84 loaded
Matrix = cor(a)
corrplot(Matrix,method = "circle")

To check How salary depends on other columns we check the columns having a correlation of more than 0.6.

Matrix[5,] > 0.6
## EXPERIENCE        AGE         HT         WT     SALARY PPG_CAREER 
##      FALSE      FALSE      FALSE      FALSE       TRUE       TRUE 
## APG_CAREER RGP_CAREER         GP        MPG        FGP        THP 
##      FALSE      FALSE      FALSE       TRUE      FALSE      FALSE 
##        FTP      BLKPG      STLPG       TOPG 
##      FALSE      FALSE       TRUE       TRUE

Splitting the data into 70% training and 30% test data.

train <- df[1:440,]
test <- df[440:550,]

We see that the columns PPG_career , MPG, STLPG,TOPG AFFECT THE sALARY.

model <- lm(SALARY~ PPG_CAREER+MPG+STLPG+TOPG,data = train)
summary(model)
## 
## Call:
## lm(formula = SALARY ~ PPG_CAREER + MPG + STLPG + TOPG, data = train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -19950526  -2948146     76553   1853379  18140458 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -76554     461693  -0.166  0.86838    
## PPG_CAREER   1133988     143784   7.887 2.53e-14 ***
## MPG           -92158      73342  -1.257  0.20959    
## STLPG        3492079    1276520   2.736  0.00648 ** 
## TOPG        -2212234     869204  -2.545  0.01127 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5374000 on 435 degrees of freedom
## Multiple R-squared:  0.501,  Adjusted R-squared:  0.4964 
## F-statistic: 109.2 on 4 and 435 DF,  p-value: < 2.2e-16

Having constructed a linear model with the following variables affecting the salary attribute, we see that the r-squared is not very high indicating the model is not the best we can arrive at(correlation does not mean or indicate causation). Howevever intuitively we see that the number of games played by a player has to affect the salary he receives.

model <- lm(SALARY~ PPG_CAREER+MPG+STLPG+TOPG+GP,data = train)
summary(model)
## 
## Call:
## lm(formula = SALARY ~ PPG_CAREER + MPG + STLPG + TOPG + GP, data = train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -21629597  -2574002    -88211   1707129  17525424 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    88211     455000   0.194  0.84637    
## PPG_CAREER   1027170     143478   7.159 3.49e-12 ***
## MPG          -159617      73808  -2.163  0.03112 *  
## STLPG        3601952    1253519   2.873  0.00426 ** 
## TOPG        -1833623     858191  -2.137  0.03319 *  
## GP              5103       1226   4.161 3.83e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5276000 on 434 degrees of freedom
## Multiple R-squared:  0.5202, Adjusted R-squared:  0.5146 
## F-statistic:  94.1 on 5 and 434 DF,  p-value: < 2.2e-16

We now see that adding the GP as one of the factors for the salary attribute,increases the r-squared indicating that the model is a better fit .Also we see that p-value indicated here is very very low.This means that the p-value is statistically significant at a confidence level of 99% also.This means we can reject the null hypothesis that the given attributes do not affect the salary of the player.Basically we can assume that there is a correlation between the salary and the above fields.

Calculating the correlation accuracy for the model

predicted1 <- predict(model,test)
act_pred1 <- data.frame(cbind(actuals = test$SALARY,predict = predicted1))
cor_acc <- cor(act_pred1)
print(paste0("Correlation accuracy=",cor_acc[1,2]))
## [1] "Correlation accuracy=0.706123516422658"

We see that the correlation accuracy is 70.6% which is is not very good but reasonable.

Plotting the residuals vs Fitted values and also the normal Q-Q plot to check the variance ,linear relationship and the normality of residuals.

par(mfrow = c(2, 2))
plot(model)

We see that in the residuals plot the line at 0 is not linear exactly showing there does not completely exist a linear relationship for the linear regression model we have made.However considerinng most part of it as linear we observe heteroscedasticity as there is unequal variance on both sides of the line.The Q-Q plot actually shows a reasonable fit showing the residuals distribution to be almost normal. Hence we can conclude, that the model we have coctructed is not a very good estimator of the players’ salary as a linear model is not sufficient in this case.

Constructing another model to predict the games played by a player in his career based on his age and experience.

model2 <- lm(GP~EXPERIENCE+AGE,data = train)
summary(model)
## 
## Call:
## lm(formula = SALARY ~ PPG_CAREER + MPG + STLPG + TOPG + GP, data = train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -21629597  -2574002    -88211   1707129  17525424 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    88211     455000   0.194  0.84637    
## PPG_CAREER   1027170     143478   7.159 3.49e-12 ***
## MPG          -159617      73808  -2.163  0.03112 *  
## STLPG        3601952    1253519   2.873  0.00426 ** 
## TOPG        -1833623     858191  -2.137  0.03319 *  
## GP              5103       1226   4.161 3.83e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5276000 on 434 degrees of freedom
## Multiple R-squared:  0.5202, Adjusted R-squared:  0.5146 
## F-statistic:  94.1 on 5 and 434 DF,  p-value: < 2.2e-16

We now see that the r-squaredis very high (.971) indicating that the model is a very good fit .Also we see that p-value indicated here is very very low.This means that the p-value is statistically significant at a confidence level of 99% also.This means we can reject the null hypothesis that the given attributes do not affect the games played by the player.Basically we can assume that there is a strong correlation between the games played and the above fields.

Calculating the correlation accuracy for the model

predicted2 <- predict(model2,test)
act_pred2 <- data.frame(cbind(actuals = test$GP,predict = predicted2))
cor_acc2 <- cor(act_pred2)
print(paste0("Correlation accuracy=",cor_acc2[1,2]))
## [1] "Correlation accuracy=0.980395501874178"

The correlation accuracy is approx 98% which indicates the model is a very good fit.

Plotting the residuals vs Fitted values and also the normal Q-Q plot to check the variance ,linear relationship and the normality of residuals for the second model.

par(mfrow = c(2, 2))
plot(model2)

Now we see that residuals vs fitted values is slightly better showing a homoscedastic relationship and the Q-Q plot shows almost a normal distribution.

In conclusion the second model constructed to predict the Games played is a better fit and a decent model with high accuracy.